--Use Case 1: Calculate the total number of sessions grouped by access date, landing page and exit page

-- Counts distinct sessions per (access date, landing page, exit page).
-- The landing page is the page of the earliest event in the session and the
-- exit page is the page of the latest event, both ordered by visitedtime.
-- The explicit ORDER BY plus the UNBOUNDED PRECEDING .. UNBOUNDED FOLLOWING
-- frame is required: without them first_value/last_value return an arbitrary
-- row of the partition (or just the current row for last_value).
-- row_number() is used instead of rank() so that timestamp ties cannot prevent
-- the "last row" test from matching.
SELECT edate, landPage, exitPage, COUNT(DISTINCT sessionid) AS total_sessions
FROM (
	SELECT sessionid,
		edate,
		first_value(page) OVER (PARTITION BY sessionid ORDER BY visitedtime ASC
			ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS landPage,
		last_value(page) OVER (PARTITION BY sessionid ORDER BY visitedtime ASC
			ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS exitPage,
		row_number() OVER (PARTITION BY sessionid ORDER BY visitedtime ASC) AS rn_first,
		row_number() OVER (PARTITION BY sessionid ORDER BY visitedtime DESC) AS rn_last
	FROM customer_action
) b
-- Keep only the first and last event of each session, as the original did.
WHERE rn_first = 1 OR rn_last = 1
GROUP BY edate, landPage, exitPage;


--Use Case 2: Calculate the total number of sessions that access only a single page

-- Per (landing page, access date): number of sessions that consist of exactly
-- one event (singlepage_count) and number of sessions overall (total_sessions).
-- Each session contributes exactly one row: its first event by visitedtime.
-- row_number() (not rank()) guarantees a single row per session even when
-- several events share the earliest visitedtime; rank() would emit all tied
-- rows and count the session more than once.
SELECT page,
	edate,
	sum(CASE WHEN sc = 1 THEN 1 ELSE 0 END) AS singlepage_count,
	count(1) AS total_sessions
FROM (
	SELECT sessionid,
		page,
		edate,
		visitedtime,
		count(*) OVER (PARTITION BY sessionid) AS sc,  -- events in the session
		row_number() OVER (PARTITION BY sessionid ORDER BY visitedtime ASC) AS srank
	FROM customer_action
) t
WHERE srank = 1
GROUP BY page, edate;


--Use Case 3: Calculate total number of new users and repeat users by access date


-- Per access date: users seen for the first time (new_users) versus users
-- who were already seen on an earlier date (repeat_users).
-- p1 reduces customer_action to one row per (userid, edate). GROUP BY is used
-- rather than rank() = 1 because rank() returns several rows when events tie
-- on visitedtime, which would inflate previous_c and misclassify a new user
-- as a repeat user.
-- previous_c is the running count of distinct access dates for the user up to
-- and including the current date: 1 means first visit, > 1 means returning.
SELECT edate,
	sum(CASE WHEN previous_c = 1 THEN 1 ELSE 0 END) AS new_users,
	sum(CASE WHEN previous_c > 1 THEN 1 ELSE 0 END) AS repeat_users
FROM (
	SELECT userid,
		edate,
		count(*) OVER (PARTITION BY userid ORDER BY edate
			ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS previous_c
	FROM (
		SELECT userid, edate
		FROM customer_action
		GROUP BY userid, edate
	) p1
) p2
GROUP BY edate
ORDER BY edate;


--Use Case 4: Calculate the path to purchase, grouped by access date and session


-- For every session whose final page (by visitedtime) is 'product', returns
-- the set of pages visited, per access date and session.
-- last_value needs both the ORDER BY and the full-partition frame; without
-- them it does not reliably return the last page of the session.
-- NOTE(review): collect_set de-duplicates pages and does not guarantee visit
-- order; confirm that an unordered set is acceptable for a "path".
SELECT edate, sessionid, collect_set(page) AS path_to_purchase
FROM (
	SELECT sessionid,
		edate,
		page,
		last_value(page) OVER (PARTITION BY sessionid ORDER BY visitedtime ASC
			ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_page
	FROM customer_action
) a
WHERE last_page = 'product'
GROUP BY edate, sessionid;


--Use Case 5: Calculate most frequent next action for the users

-- For every page, lists the pages visited immediately afterwards within the
-- same session together with how often that transition occurred (c), most
-- frequent transitions first.
-- lead(page, 1) yields the next page in visitedtime order; the last event of
-- a session has no successor (NULL) and is excluded.
SELECT page, next_page, count(*) AS c
FROM (
	SELECT sessionid,
		page,
		lead(page, 1) OVER (PARTITION BY sessionid ORDER BY visitedtime ASC) AS next_page
	FROM customer_action
) a
WHERE next_page IS NOT NULL
GROUP BY page, next_page
ORDER BY c DESC;


--Use Case 6: Predict the airline delay

1 Data Preparation

1.1 Upload the data file from sandbox into HDFS

# Create one HDFS directory per dataset and copy the local CSV into it.
# -p creates the missing parent /tmp/airflightsdelays and makes the command
# safe to re-run when the directory already exists.
hadoop fs -mkdir -p /tmp/airflightsdelays/flight2007
hadoop fs -put /root/TrainingOnHDP/dataset/flights_2007.csv /tmp/airflightsdelays/flight2007/flights_2007.csv

hadoop fs -mkdir -p /tmp/airflightsdelays/flight2008
hadoop fs -put /root/TrainingOnHDP/dataset/flights_2008.csv /tmp/airflightsdelays/flight2008/flights_2008.csv

hadoop fs -mkdir -p /tmp/airflightsdelays/weather2007
hadoop fs -put /root/TrainingOnHDP/dataset/weather_2007.csv /tmp/airflightsdelays/weather2007/weather_2007.csv

hadoop fs -mkdir -p /tmp/airflightsdelays/weather2008
hadoop fs -put /root/TrainingOnHDP/dataset/weather_2008.csv /tmp/airflightsdelays/weather2008/weather_2008.csv


1.2 Create and populate the tables

-- External table over the comma-delimited 2007 flight file stored under
-- /tmp/airflightsdelays/flight2007; the first line of each file is skipped.
CREATE EXTERNAL TABLE IF NOT EXISTS flight_2007 (
    year              INT,
    month             INT,
    day               INT,
    week              INT,
    deptime           STRING,
    crsdeptime        STRING,
    arrtime           STRING,
    crsarrtime        STRING,
    uniquecarrier     STRING,
    flightnum         STRING,
    tailnum           STRING,
    actualelapsedtime INT,
    crselapsedtime    INT,
    airtime           INT,
    arrdelay          INT,
    depdelay          INT,
    origin            STRING,
    dest              STRING,
    distance          INT,
    taxiin            INT,
    taxiout           INT,
    cancelled         INT,
    cancelledcode     STRING,
    Diverted          INT,
    CarrierDelay      INT,
    WeatherDelay      INT,
    NASDelay          INT,
    SecurityDelay     INT,
    LateAircraftDel   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/airflightsdelays/flight2007'
TBLPROPERTIES ("skip.header.line.count"="1");


-- Non-cancelled 2007 departures from ORD, enriched with:
--   fdate   : flight date as a yyyyMMdd string (zero-padded month and day)
--   crshour : scheduled departure hour, derived from crsdeptime
--   status  : 1 when the departure delay exceeds 15, otherwise 0
-- crshour is computed numerically (value / 100) instead of by string length:
-- taking the first character of a 1- or 2-digit crsdeptime such as "45"
-- (00:45) would yield hour 4 instead of 0. It is cast back to string so the
-- column type stays the same as before.
CREATE TABLE flight_2007_enrich AS
SELECT concat(cast(year AS string),
              lpad(cast(month AS string), 2, '0'),
              lpad(cast(day AS string), 2, '0')) AS fdate,
       year,
       month,
       day,
       week,
       cast(floor(cast(crsdeptime AS int) / 100) AS string) AS crshour,
       (CASE WHEN depDelay > 15 THEN 1 ELSE 0 END) AS status,
       crsdeptime,
       depdelay,
       origin,
       dest,
       distance,
       cancelled
FROM flight_2007
WHERE cancelled = 0
  AND origin = "ORD";

-- External table over the comma-delimited 2008 flight file stored under
-- /tmp/airflightsdelays/flight2008; the first line of each file is skipped.
CREATE EXTERNAL TABLE IF NOT EXISTS flight_2008 (
    year              INT,
    month             INT,
    day               INT,
    week              INT,
    deptime           STRING,
    crsdeptime        STRING,
    arrtime           STRING,
    crsarrtime        STRING,
    uniquecarrier     STRING,
    flightnum         STRING,
    tailnum           STRING,
    actualelapsedtime INT,
    crselapsedtime    INT,
    airtime           INT,
    arrdelay          INT,
    depdelay          INT,
    origin            STRING,
    dest              STRING,
    distance          INT,
    taxiin            INT,
    taxiout           INT,
    cancelled         INT,
    cancelledcode     STRING,
    Diverted          INT,
    CarrierDelay      INT,
    WeatherDelay      INT,
    NASDelay          INT,
    SecurityDelay     INT,
    LateAircraftDel   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/airflightsdelays/flight2008'
TBLPROPERTIES ("skip.header.line.count"="1");


-- Non-cancelled 2008 departures from ORD, enriched with:
--   fdate   : flight date as a yyyyMMdd string (zero-padded month and day)
--   crshour : scheduled departure hour, derived from crsdeptime
--   status  : 1 when the departure delay exceeds 15, otherwise 0
-- crshour is computed numerically (value / 100) instead of by string length:
-- taking the first character of a 1- or 2-digit crsdeptime such as "45"
-- (00:45) would yield hour 4 instead of 0. It is cast back to string so the
-- column type stays the same as before.
CREATE TABLE flight_2008_enrich AS
SELECT concat(cast(year AS string),
              lpad(cast(month AS string), 2, '0'),
              lpad(cast(day AS string), 2, '0')) AS fdate,
       year,
       month,
       day,
       week,
       cast(floor(cast(crsdeptime AS int) / 100) AS string) AS crshour,
       (CASE WHEN depDelay > 15 THEN 1 ELSE 0 END) AS status,
       crsdeptime,
       depdelay,
       origin,
       dest,
       distance,
       cancelled
FROM flight_2008
WHERE cancelled = 0
  AND origin = "ORD";


-- External table over the comma-delimited 2007 weather file stored under
-- /tmp/airflightsdelays/weather2007. Each row carries one metric value
-- (metrics / mvalue) for a station and date; c1..c4 are not used below.
CREATE EXTERNAL TABLE IF NOT EXISTS weather_2007 (
    station STRING,
    wdate   STRING,
    metrics STRING,
    mvalue  STRING,
    c1      STRING,
    c2      STRING,
    c3      STRING,
    c4      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/airflightsdelays/weather2007';


-- Pivots the 2007 weather readings of station USW00094846 into one row per
-- date with columns tmax, tmin, prcp, snow and awnd.
-- tmin drives the joins, so wdate is taken from tmin: selecting tmax.wdate
-- (the left-joined side) would produce a NULL date whenever the TMAX reading
-- is missing, and that row could then never match a flight date.
CREATE TABLE weather_2007_join AS
WITH tmin AS (SELECT wdate, mvalue AS tmin FROM weather_2007 WHERE station = "USW00094846" AND metrics = "TMIN"),
tmax AS (SELECT wdate, mvalue AS tmax FROM weather_2007 WHERE station = "USW00094846" AND metrics = "TMAX"),
prcp AS (SELECT wdate, mvalue AS prcp FROM weather_2007 WHERE station = "USW00094846" AND metrics = "PRCP"),
snow AS (SELECT wdate, mvalue AS snow FROM weather_2007 WHERE station = "USW00094846" AND metrics = "SNOW"),
awnd AS (SELECT wdate, mvalue AS awnd FROM weather_2007 WHERE station = "USW00094846" AND metrics = "AWND")
SELECT tmin.wdate, tmax.tmax, tmin.tmin, prcp.prcp, snow.snow, awnd.awnd
FROM tmin
LEFT JOIN tmax ON tmin.wdate = tmax.wdate
LEFT JOIN prcp ON tmin.wdate = prcp.wdate
LEFT JOIN snow ON tmin.wdate = snow.wdate
LEFT JOIN awnd ON tmin.wdate = awnd.wdate;


-- External table over the comma-delimited 2008 weather file stored under
-- /tmp/airflightsdelays/weather2008. Each row carries one metric value
-- (metrics / mvalue) for a station and date; c1..c4 are not used below.
CREATE EXTERNAL TABLE IF NOT EXISTS weather_2008 (
    station STRING,
    wdate   STRING,
    metrics STRING,
    mvalue  STRING,
    c1      STRING,
    c2      STRING,
    c3      STRING,
    c4      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/airflightsdelays/weather2008';


-- Pivots the 2008 weather readings of station USW00094846 into one row per
-- date with columns tmax, tmin, prcp, snow and awnd.
-- tmin drives the joins, so wdate is taken from tmin: selecting tmax.wdate
-- (the left-joined side) would produce a NULL date whenever the TMAX reading
-- is missing, and that row could then never match a flight date.
CREATE TABLE weather_2008_join AS
WITH tmin AS (SELECT wdate, mvalue AS tmin FROM weather_2008 WHERE station = "USW00094846" AND metrics = "TMIN"),
tmax AS (SELECT wdate, mvalue AS tmax FROM weather_2008 WHERE station = "USW00094846" AND metrics = "TMAX"),
prcp AS (SELECT wdate, mvalue AS prcp FROM weather_2008 WHERE station = "USW00094846" AND metrics = "PRCP"),
snow AS (SELECT wdate, mvalue AS snow FROM weather_2008 WHERE station = "USW00094846" AND metrics = "SNOW"),
awnd AS (SELECT wdate, mvalue AS awnd FROM weather_2008 WHERE station = "USW00094846" AND metrics = "AWND")
SELECT tmin.wdate, tmax.tmax, tmin.tmin, prcp.prcp, snow.snow, awnd.awnd
FROM tmin
LEFT JOIN tmax ON tmin.wdate = tmax.wdate
LEFT JOIN prcp ON tmin.wdate = prcp.wdate
LEFT JOIN snow ON tmin.wdate = snow.wdate
LEFT JOIN awnd ON tmin.wdate = awnd.wdate;


-- Put the UDF jar on the session classpath, then expose the CalHolidaysGap
-- class as the session-scoped function calgap.
ADD JAR /root/TrainingOnHDP/PredictionAirlineDelayOnHive/target/PredictionAirlineDelayOnHive-1.0-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION calgap
    AS 'ca.training.bigdata.hive.udf.hive.udf.CalHolidaysGap';


calgap UDF java code 

package ca.training.bigdata.hive.udf.hive.udf

public final class CalHolidaysGap  extends UDF {

    String[] holidays = {"01/01/2007", "01/15/2007", "02/19/2007", "05/28/2007", "06/07/2007", "07/04/2007",
            "09/03/2007", "10/08/2007", "11/11/2007", "11/22/2007", "12/25/2007",
            "01/01/2008", "01/21/2008", "02/18/2008", "05/22/2008", "05/26/2008", "07/04/2008",
            "09/01/2008", "10/13/2008", "11/11/2008", "11/27/2008", "12/25/2008"};

    DateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");

    public LongWritable evaluate(final IntWritable year, IntWritable month, IntWritable day) {
        Calendar start = Calendar.getInstance();
        start.set(year.get(), month.get(), day.get());
        long startTime = start.getTime().getTime();
        long endTime = 0;
        Date end = null;
        long diffDays = 3000;
        for (String holiday : holidays) {
            try {
                end = (Date) dateFormat.parse(holiday);
            }catch(Exception e){}
            endTime = end.getTime();
            long diffTime = Math.abs(endTime - startTime);
            if (diffDays > diffTime / (1000 * 60 * 60 * 24))
                diffDays = Math.round(diffTime / (1000 * 60 * 60 * 24));
        }
        return new LongWritable(diffDays);
    }
}


-- Joins every enriched 2007 flight to the weather readings of the same date
-- (fdate = wdate) and adds gap, the result of calgap(year, month, day).
-- crshour and the weather values are cast from string to int.
CREATE TABLE flight_2007_join AS
SELECT
    a.status,
    a.month,
    a.day,
    a.week,
    CAST(a.crshour AS INT)           AS crshour,
    a.distance,
    calgap(a.year, a.month, a.day)   AS gap,
    CAST(b.tmax AS INT)              AS tmax,
    CAST(b.tmin AS INT)              AS tmin,
    CAST(b.prcp AS INT)              AS prcp,
    CAST(b.snow AS INT)              AS snow,
    CAST(b.awnd AS INT)              AS awnd
FROM flight_2007_enrich a
LEFT JOIN weather_2007_join b
    ON a.fdate = b.wdate;


-- Joins every enriched 2008 flight to the weather readings of the same date
-- (fdate = wdate) and adds gap, the result of calgap(year, month, day).
-- crshour and the weather values are cast from string to int.
CREATE TABLE flight_2008_join AS
SELECT
    a.status,
    a.month,
    a.day,
    a.week,
    CAST(a.crshour AS INT)           AS crshour,
    a.distance,
    calgap(a.year, a.month, a.day)   AS gap,
    CAST(b.tmax AS INT)              AS tmax,
    CAST(b.tmin AS INT)              AS tmin,
    CAST(b.prcp AS INT)              AS prcp,
    CAST(b.snow AS INT)              AS snow,
    CAST(b.awnd AS INT)              AS awnd
FROM flight_2008_enrich a
LEFT JOIN weather_2008_join b
    ON a.fdate = b.wdate;


-- Add the Hivemall library jar to the session classpath.
add jar /root/TrainingOnHDP/lib/hivemall-core-0.4.2-rc.2-with-dependencies.jar;
-- Execute the bundled define-all.hive script.
-- NOTE(review): presumably this registers the Hivemall functions used below
-- (zscore, logress, sigmoid) - confirm against the script's contents.
source /root/TrainingOnHDP/lib/define-all.hive;



2 Data Normalization (can be skipped for this lab)

2.1 Min-Max Normalization

-- Reports the minimum and maximum of every feature column of
-- flight_2007_join (the inputs a min-max normalization would need).
SELECT
    MIN(month)    AS min_month,    MAX(month)    AS max_month,
    MIN(day)      AS min_day,      MAX(day)      AS max_day,
    MIN(week)     AS min_week,     MAX(week)     AS max_week,
    MIN(crshour)  AS min_crshour,  MAX(crshour)  AS max_crshour,
    MIN(distance) AS min_distance, MAX(distance) AS max_distance,
    MIN(gap)      AS min_gap,      MAX(gap)      AS max_gap,
    MIN(tmax)     AS min_tmax,     MAX(tmax)     AS max_tmax,
    MIN(tmin)     AS min_tmin,     MAX(tmin)     AS max_tmin,
    MIN(prcp)     AS min_prcp,     MAX(prcp)     AS max_prcp,
    MIN(snow)     AS min_snow,     MAX(snow)     AS max_snow,
    MIN(awnd)     AS min_awnd,     MAX(awnd)     AS max_awnd
FROM flight_2007_join;


2.2 Feature scaling by zscore

-- Single-row table holding the mean and population standard deviation of
-- every feature column of flight_2007_join.
CREATE TABLE flight_2007_stddev AS
SELECT
    AVG(month)    AS avg_month,    STDDEV_POP(month)    AS stddev_month,
    AVG(day)      AS avg_day,      STDDEV_POP(day)      AS stddev_day,
    AVG(week)     AS avg_week,     STDDEV_POP(week)     AS stddev_week,
    AVG(crshour)  AS avg_crshour,  STDDEV_POP(crshour)  AS stddev_crshour,
    AVG(distance) AS avg_distance, STDDEV_POP(distance) AS stddev_distance,
    AVG(gap)      AS avg_gap,      STDDEV_POP(gap)      AS stddev_gap,
    AVG(tmax)     AS avg_tmax,     STDDEV_POP(tmax)     AS stddev_tmax,
    AVG(tmin)     AS avg_tmin,     STDDEV_POP(tmin)     AS stddev_tmin,
    AVG(prcp)     AS avg_prcp,     STDDEV_POP(prcp)     AS stddev_prcp,
    AVG(snow)     AS avg_snow,     STDDEV_POP(snow)     AS stddev_snow,
    AVG(awnd)     AS avg_awnd,     STDDEV_POP(awnd)     AS stddev_awnd
FROM flight_2007_join;


-- Applies zscore to every feature of flight_2007_join using the averages and
-- standard deviations from the single-row flight_2007_stddev table (hence
-- the cross join). status is carried over as label.
CREATE TABLE flight_2007_scaled AS
SELECT
    status                                             AS label,
    zscore(month, avg_month, stddev_month)             AS month,
    zscore(day, avg_day, stddev_day)                   AS day,
    zscore(week, avg_week, stddev_week)                AS week,
    zscore(crshour, avg_crshour, stddev_crshour)       AS crshour,
    zscore(distance, avg_distance, stddev_distance)    AS distance,
    zscore(gap, avg_gap, stddev_gap)                   AS gap,
    zscore(tmax, avg_tmax, stddev_tmax)                AS tmax,
    zscore(tmin, avg_tmin, stddev_tmin)                AS tmin,
    zscore(prcp, avg_prcp, stddev_prcp)                AS prcp,
    zscore(snow, avg_snow, stddev_snow)                AS snow,
    zscore(awnd, avg_awnd, stddev_awnd)                AS awnd
FROM flight_2007_join
CROSS JOIN flight_2007_stddev;


3. Prepare the training and test dataset

-- Builds the training set (2007), the test set (2008) and the exploded test
-- set (one row per feature) used for prediction.
-- Every feature is emitted as a "<column>#<value>" string instead of a bare
-- integer. With bare integers the feature identity is just the number, so
-- month = 5, day = 5 and crshour = 5 would all be the same feature and share
-- one weight; the column prefix keeps them apart. Training and test use the
-- identical encoding so that the model join on feature still matches.
-- NOTE(review): a NULL source value yields a NULL feature, as before.
CREATE TABLE flight_2007_training AS
SELECT status AS label,
       array(
           concat('month#',    cast(month    AS string)),
           concat('day#',      cast(day      AS string)),
           concat('week#',     cast(week     AS string)),
           concat('crshour#',  cast(crshour  AS string)),
           concat('distance#', cast(distance AS string)),
           concat('gap#',      cast(gap      AS string)),
           concat('tmax#',     cast(tmax     AS string)),
           concat('tmin#',     cast(tmin     AS string)),
           concat('prcp#',     cast(prcp     AS string)),
           concat('snow#',     cast(snow     AS string)),
           concat('awnd#',     cast(awnd     AS string))
       ) AS features
FROM flight_2007_join;

-- rowid identifies a test row so its exploded features can be re-aggregated.
CREATE TABLE flight_2008_test AS
SELECT row_number() OVER () AS rowid,
       status AS label,
       array(
           concat('month#',    cast(month    AS string)),
           concat('day#',      cast(day      AS string)),
           concat('week#',     cast(week     AS string)),
           concat('crshour#',  cast(crshour  AS string)),
           concat('distance#', cast(distance AS string)),
           concat('gap#',      cast(gap      AS string)),
           concat('tmax#',     cast(tmax     AS string)),
           concat('tmin#',     cast(tmin     AS string)),
           concat('prcp#',     cast(prcp     AS string)),
           concat('snow#',     cast(snow     AS string)),
           concat('awnd#',     cast(awnd     AS string))
       ) AS features
FROM flight_2008_join;

-- One row per (test row, feature).
CREATE TABLE flight_2008_test_exploded AS
SELECT rowid, label, feature
FROM flight_2008_test
LATERAL VIEW explode(features) t AS feature;


4. Training using Logistic Regression


-- Trains the logistic regression model with logress over the 2007 training
-- set and stores one averaged weight per feature.
CREATE TABLE lr_model AS
SELECT
    feature,
    CAST(AVG(weight) AS FLOAT) AS weight
FROM (
    SELECT logress(features, label, "-total_steps 10000000") AS (feature, weight)
    FROM flight_2007_training
) t
GROUP BY feature;


5. Prediction using test dataset

-- Scores every 2008 test row: the weights of its features are summed and
-- passed through sigmoid to obtain the predicted probability (prob).
-- coalesce(..., 0.0) covers rows none of whose features exist in the model:
-- the LEFT OUTER JOIN then yields only NULL weights, sum() returns NULL and
-- prob would be NULL, which silently drops the row from every metric.
-- The former ORDER BY was removed: row order is not preserved by the created
-- table, so the global sort only added cost.
CREATE TABLE lr_predict AS
SELECT t.rowid,
       t.label,
       sigmoid(coalesce(sum(m.weight), 0.0)) AS prob
FROM flight_2008_test_exploded t
LEFT OUTER JOIN lr_model m ON (t.feature = m.feature)
GROUP BY t.rowid, t.label;


6. Generate the prediction metrics and calculate precision, recall and accuracy

-- Confusion-matrix counts at a 0.5 probability threshold:
--   tp: actual 1, predicted 1        tn: actual 0, predicted 0
--   fp: actual 0, predicted 1        fn: actual 1, predicted 0
-- fp and fn were previously assigned the other way round, which exchanged
-- the precision and recall derived from this table.
CREATE TABLE lr_metrics AS
SELECT sum(CASE WHEN (label = 1 AND prob >= 0.5) THEN 1 ELSE 0 END) AS tp,
       sum(CASE WHEN (label = 0 AND prob < 0.5) THEN 1 ELSE 0 END) AS tn,
       sum(CASE WHEN (label = 0 AND prob >= 0.5) THEN 1 ELSE 0 END) AS fp,
       sum(CASE WHEN (label = 1 AND prob < 0.5) THEN 1 ELSE 0 END) AS fn
FROM lr_predict;


-- Derives precision, recall and accuracy from the counts in lr_metrics.
-- The precision alias is back-quoted; the resulting column name is unchanged.
-- NOTE(review): PRECISION may be a reserved word in newer Hive versions - confirm.
SELECT
    tp / (tp + fp)                  AS `precision`,
    tp / (tp + fn)                  AS recall,
    (tp + tn) / (tp + tn + fp + fn) AS accuracy
FROM lr_metrics;






